# This code scrapes a few pages on the IQSS blog

# grab some libraries that are useful
# you may have to install some of these.  It can be a little tricky.
import mechanize
import urllib2
import time
import random
# Beautiful Soup is an easier install than you think.  Just grab the latest
# version of the .py file from the online repository and put it in your
# ~/Lib/site-packages directory, in the Python folders
from BeautifulSoup import BeautifulSoup as bs

# Where the downloaded pages (and later the extracted content) get written.
# The value must end with a slash because file names are appended directly.
# CHANGE THIS to a folder that exists on your own machine!
basefilepath = "C:/Documents and Settings/Rich/Desktop/"

# Each page address is built as baseurlpart1 + one entry of baseurlpart2.
# The prefix is identical for every request ...
baseurlpart1 = "http://www.iq.harvard.edu/blog/sss/archives/2010/"
# ... and these suffixes are visited one after another
# (they look like month numbers under the 2010 archive -- confirm on the site)
baseurlpart2 = [
    "08",
    "09",
    "10",
]

# Start the loop
# Download every archive page and save its raw HTML as
# <basefilepath><urlpart>.txt, one file per entry of baseurlpart2.
# We walk the list itself instead of a hard-coded range(0,3), so adding or
# removing entries in baseurlpart2 needs no change here.
# note that indexing in python starts with zero, in keeping with cs tradition
for i, urlpart in enumerate(baseurlpart2):
    # progress indicator: i%1 is always 0, so this prints on every pass;
    # raise the 1 to N to print only every Nth page
    if i%1 == 0:
        print(i)
    targeturl = baseurlpart1 + urlpart
    # this is one way to open webpages (urllib2) but it's commented out here
    #response = urllib2.urlopen(targeturl)
    #html = response.read()
    # This is a second way to open web pages, it's what I like better
    br = mechanize.Browser()
    html = br.open(targeturl).get_data()
    filepath = basefilepath + urlpart + ".txt"
    print(filepath)
    # the with-block closes the file even if the write fails part-way
    with open(filepath, 'w') as f:
        f.write(html)
    # wait a few seconds so that we don't use up all the bandwidth
    # right now this is commented out
    #secs=random.uniform(5,15)
    #time.sleep(secs)

# The loop body never breaks, so this always runs once the loop finishes.
print('The scrape loop is over')

# then parse the html
# For every page saved by the scrape loop: read the HTML back from disk,
# collect each <div> whose markup mentions "BlogContent", and write one of
# those divs to <basefilepath><urlpart>content.txt.

# Which of the matching divs to keep (zero-based).
# NOTE(review): presumably the earlier matches are outer wrapper divs whose
# markup merely contains the content div -- confirm against the live page.
content_index = 3

for i, urlpart in enumerate(baseurlpart2):
    # progress indicator: i%1 is always 0, so this prints on every pass
    if i%1 == 0:
        print(i)
    filepath = basefilepath + urlpart + ".txt"
    with open(filepath, 'r') as f:
        html = f.read()
    soup = bs(html)
    content = soup.findAll("div")
    # Keep the divs whose serialized HTML contains the marker text.
    # A plain substring test is enough here; it is a first step before
    # moving on to regular expressions.
    myholder = []
    for div in content:
        text = str(div)
        if "BlogContent" in text:
            myholder.append(text)

    # Guard the fixed index: a page with a different layout would otherwise
    # raise IndexError and leave a half-written output file behind.
    if len(myholder) <= content_index:
        print("Skipping " + filepath + ": found only " + str(len(myholder))
              + " div(s) containing BlogContent")
        continue

    filepath = basefilepath + urlpart + "content" + ".txt"
    # the with-block closes the file even if a write fails
    with open(filepath, 'w') as f:
        f.write("BLOG CONTENT:\n")
        f.write(myholder[content_index])

# The loop body never breaks, so this always runs once the loop finishes.
print('The parsing loop is over')
    

		

